<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - May 01, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Tailwind text-4xl or 5xl */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional) */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-2 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>May 01, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">ReVision: High-Quality, Low-Cost Video Generation with Explicit 3D Physics Modeling for Complex Motion and Interaction</h2>
            <p class="paper-summary">In recent years, video generation has seen significant advancements. However,
challenges still persist in generating complex motions and interactions. To
address these challenges, we introduce ReVision, a plug-and-play framework that
explicitly integrates parameterized 3D physical knowledge into a pretrained
conditional video generation model, significantly enhancing its ability to
generate high-quality videos with complex motion and interactions.
Specifically, ReVision consists of three stages. First, a video diffusion model
is used to generate a coarse video. Next, we extract a set of 2D and 3D
features from the coarse video to construct a 3D object-centric representation,
which is then refined by our proposed parameterized physical prior model to
produce an accurate 3D motion sequence. Finally, this refined motion sequence
is fed back into the same video diffusion model as additional conditioning,
enabling the generation of motion-consistent videos, even in scenarios
involving complex actions and interactions. We validate the effectiveness of
our approach on Stable Video Diffusion, where ReVision significantly improves
motion fidelity and coherence. Remarkably, with only 1.5B parameters, it even
outperforms a state-of-the-art video generation model with over 13B parameters
on complex video generation by a substantial margin. Our results suggest that,
by incorporating 3D physical knowledge, even a relatively small video diffusion
model can generate complex motions and interactions with greater realism and
controllability, offering a promising solution for physically plausible video
generation.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces revision, a framework that enhances video generation by integrating 3d physical knowledge into a pretrained video diffusion model, leading to improved motion fidelity and performance compared to larger models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了一种名为revision的框架，通过将3d物理知识整合到预训练的视频扩散模型中来增强视频生成，从而提高了运动保真度，并且性能优于更大的模型。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(10/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(9/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21855v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Qihao Liu, Ju He, Qihang Yu, Liang-Chieh Chen, Alan Yuille</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">A Survey of Interactive Generative Video</h2>
            <p class="paper-summary">Interactive Generative Video (IGV) has emerged as a crucial technology in
response to the growing demand for high-quality, interactive video content
across various domains. In this paper, we define IGV as a technology that
combines generative capabilities to produce diverse high-quality video content
with interactive features that enable user engagement through control signals
and responsive feedback. We survey the current landscape of IGV applications,
focusing on three major domains: 1) gaming, where IGV enables infinite
exploration in virtual worlds; 2) embodied AI, where IGV serves as a
physics-aware environment synthesizer for training agents in multimodal
interaction with dynamically evolving scenes; and 3) autonomous driving, where
IGV provides closed-loop simulation capabilities for safety-critical testing
and validation. To guide future development, we propose a comprehensive
framework that decomposes an ideal IGV system into five essential modules:
Generation, Control, Memory, Dynamics, and Intelligence. Furthermore, we
systematically analyze the technical challenges and future directions in
realizing each component for an ideal IGV system, such as achieving real-time
generation, enabling open-domain control, maintaining long-term coherence,
simulating accurate physics, and integrating causal reasoning. We believe that
this systematic analysis will facilitate future research and development in the
field of IGV, ultimately advancing the technology toward more sophisticated and
practical applications.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper surveys interactive generative video (igv) techniques across gaming, embodied ai, and autonomous driving. it proposes a framework and analyzes challenges for future igv development.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文综述了交互式生成视频（igv）技术在游戏、具身人工智能和自动驾驶等领域的应用。它提出了一个框架，并分析了igv未来发展面临的挑战。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21853v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jiwen Yu, Yiran Qin, Haoxuan Che, Quande Liu, Xintao Wang, Pengfei Wan, Di Zhang, Kun Gai, Hao Chen, Xihui Liu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Why Compress What You Can Generate? When GPT-4o Generation Ushers in Image Compression Fields</h2>
            <p class="paper-summary">The rapid development of AIGC foundation models has revolutionized the
paradigm of image compression, which paves the way for the abandonment of most
pixel-level transform and coding, compelling us to ask: why compress what you
can generate if the AIGC foundation model is powerful enough to faithfully
generate intricate structure and fine-grained details from nothing more than
some compact descriptors, i.e., texts, or cues. Fortunately, recent GPT-4o
image generation of OpenAI has achieved impressive cross-modality generation,
editing, and design capabilities, which motivates us to answer the above
question by exploring its potential in image compression fields. In this work,
we investigate two typical compression paradigms: textual coding and multimodal
coding (i.e., text + extremely low-resolution image), where all/most
pixel-level information is generated instead of compressing via the advanced
GPT-4o image generation function. The essential challenge lies in how to
maintain semantic and structure consistency during the decoding process. To
overcome this, we propose a structure raster-scan prompt engineering mechanism
to transform the image into textual space, which is compressed as the condition
of GPT-4o image generation. Extensive experiments have shown that the
combination of our designed structural raster-scan prompts and GPT-4o's image
generation function achieved the impressive performance compared with recent
multimodal/generative image compression at ultra-low bitrate, further
indicating the potential of AIGC generation in image compression fields.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper explores using gpt-4o for image compression by generating images from text prompts and low-resolution images, achieving comparable performance to existing methods at ultra-low bitrates.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文探讨了使用gpt-4o通过文本提示和低分辨率图像生成图像来进行图像压缩的方法，并在超低比特率下实现了与现有方法相当的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21814v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yixin Gao, Xiaohan Pan, Xin Li, Zhibo Chen</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.15000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">HoloTime: Taming Video Diffusion Models for Panoramic 4D Scene Generation</h2>
            <p class="paper-summary">The rapid advancement of diffusion models holds the promise of
revolutionizing the application of VR and AR technologies, which typically
require scene-level 4D assets for user experience. Nonetheless, existing
diffusion models predominantly concentrate on modeling static 3D scenes or
object-level dynamics, constraining their capacity to provide truly immersive
experiences. To address this issue, we propose HoloTime, a framework that
integrates video diffusion models to generate panoramic videos from a single
prompt or reference image, along with a 360-degree 4D scene reconstruction
method that seamlessly transforms the generated panoramic video into 4D assets,
enabling a fully immersive 4D experience for users. Specifically, to tame video
diffusion models for generating high-fidelity panoramic videos, we introduce
the 360World dataset, the first comprehensive collection of panoramic videos
suitable for downstream 4D scene reconstruction tasks. With this curated
dataset, we propose Panoramic Animator, a two-stage image-to-video diffusion
model that can convert panoramic images into high-quality panoramic videos.
Following this, we present Panoramic Space-Time Reconstruction, which leverages
a space-time depth estimation method to transform the generated panoramic
videos into 4D point clouds, enabling the optimization of a holistic 4D
Gaussian Splatting representation to reconstruct spatially and temporally
consistent 4D scenes. To validate the efficacy of our method, we conducted a
comparative analysis with existing approaches, revealing its superiority in
both panoramic video generation and 4D scene reconstruction. This demonstrates
our method's capability to create more engaging and realistic immersive
environments, thereby enhancing user experiences in VR and AR applications.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces holotime, a framework that uses video diffusion models to generate panoramic videos from prompts or images, which are then reconstructed into 4d scene assets for immersive vr/ar experiences, using a new dataset and novel method for space-time reconstruction.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了holotime，一个利用视频扩散模型从提示或图像生成全景视频的框架，然后将这些视频重构为4d场景资产，用于沉浸式vr/ar体验。该框架使用了一个新的数据集和一种用于时空重建的新方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21650v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Haiyang Zhou, Wangbo Yu, Jiawen Guan, Xinhua Cheng, Yonghong Tian, Li Yuan</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MagicPortrait: Temporally Consistent Face Reenactment with 3D Geometric Guidance</h2>
            <p class="paper-summary">In this paper, we propose a method for video face reenactment that integrates
a 3D face parametric model into a latent diffusion framework, aiming to improve
shape consistency and motion control in existing video-based face generation
approaches. Our approach employs the FLAME (Faces Learned with an Articulated
Model and Expressions) model as the 3D face parametric representation,
providing a unified framework for modeling face expressions and head pose. This
enables precise extraction of detailed face geometry and motion features from
driving videos. Specifically, we enhance the latent diffusion model with rich
3D expression and detailed pose information by incorporating depth maps, normal
maps, and rendering maps derived from FLAME sequences. A multi-layer face
movements fusion module with integrated self-attention mechanisms is used to
combine identity and motion latent features within the spatial domain. By
utilizing the 3D face parametric model as motion guidance, our method enables
parametric alignment of face identity between the reference image and the
motion captured from the driving video. Experimental results on benchmark
datasets show that our method excels at generating high-quality face animations
with precise expression and head pose variation modeling. In addition, it
demonstrates strong generalization performance on out-of-domain images. Code is
publicly available at https://github.com/weimengting/MagicPortrait.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: magicportrait introduces a novel video face reenactment method that integrates a 3d face parametric model (flame) into a latent diffusion framework for improved shape consistency and motion control, demonstrating strong results on benchmark datasets and out-of-domain images.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: magicportrait 提出了一种新的视频人脸重演方法，该方法将 3d 人脸参数模型 (flame) 集成到潜在扩散框架中，以提高形状一致性和运动控制能力，并在基准数据集和域外图像上展示了强大的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21497v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Mengting Wei, Yante Li, Tuomas Varanka, Yan Jiang, Licai Sun, Guoying Zhao</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">GarmentDiffusion: 3D Garment Sewing Pattern Generation with Multimodal Diffusion Transformers</h2>
            <p class="paper-summary">Garment sewing patterns are fundamental design elements that bridge the gap
between design concepts and practical manufacturing. The generative modeling of
sewing patterns is crucial for creating diversified garments. However, existing
approaches are limited either by reliance on a single input modality or by
suboptimal generation efficiency. In this work, we present
\textbf{\textit{GarmentDiffusion}}, a new generative model capable of producing
centimeter-precise, vectorized 3D sewing patterns from multimodal inputs (text,
image, and incomplete sewing pattern). Our method efficiently encodes 3D sewing
pattern parameters into compact edge token representations, achieving a
sequence length that is $\textbf{10}\times$ shorter than that of the
autoregressive SewingGPT in DressCode. By employing a diffusion transformer, we
simultaneously denoise all edge tokens along the temporal axis, while
maintaining a constant number of denoising steps regardless of dataset-specific
edge and panel statistics. With all combination of designs of our model, the
sewing pattern generation speed is accelerated by $\textbf{100}\times$ compared
to SewingGPT. We achieve new state-of-the-art results on DressCodeData, as well
as on the largest sewing pattern dataset, namely GarmentCodeData. The project
website is available at https://shenfu-research.github.io/Garment-Diffusion/.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: garmentdiffusion introduces a multimodal diffusion transformer for generating 3d garment sewing patterns from text, images, and incomplete patterns, achieving significant speedups and state-of-the-art results compared to existing methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: garmentdiffusion 提出了一种多模态扩散 transformer，用于从文本、图像和不完整的图案生成 3d 服装缝纫图案，与现有方法相比，实现了显著的加速和最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21476v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Xinyu Li, Qi Yao, Yuanda Wang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.30000000000000004, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Sparse-to-Sparse Training of Diffusion Models</h2>
            <p class="paper-summary">Diffusion models (DMs) are a powerful type of generative models that have
achieved state-of-the-art results in various image synthesis tasks and have
shown potential in other domains, such as natural language processing and
temporal data modeling. Despite their stable training dynamics and ability to
produce diverse high-quality samples, DMs are notorious for requiring
significant computational resources, both in the training and inference stages.
Previous work has focused mostly on increasing the efficiency of model
inference. This paper introduces, for the first time, the paradigm of
sparse-to-sparse training to DMs, with the aim of improving both training and
inference efficiency. We focus on unconditional generation and train sparse DMs
from scratch (Latent Diffusion and ChiroDiff) on six datasets using three
different methods (Static-DM, RigL-DM, and MagRan-DM) to study the effect of
sparsity in model performance. Our experiments show that sparse DMs are able to
match and often outperform their Dense counterparts, while substantially
reducing the number of trainable parameters and FLOPs. We also identify safe
and effective values to perform sparse-to-sparse training of DMs.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces sparse-to-sparse training for diffusion models to improve training and inference efficiency, showing that sparse models can match or outperform dense models with fewer resources.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了扩散模型的稀疏到稀疏训练方法，旨在提高训练和推理效率，实验表明稀疏模型可以用更少的资源达到甚至超过密集模型的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21380v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Inês Cardoso Oliveira, Decebal Constantin Mocanu, Luis A. Leiva</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.35000000000000003, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Nexus-Gen: A Unified Model for Image Understanding, Generation, and Editing</h2>
            <p class="paper-summary">Unified multimodal large language models (MLLMs) aim to integrate multimodal
understanding and generation abilities through a single framework. Despite
their versatility, existing open-source unified models exhibit performance gaps
against domain-specific architectures. To bridge this gap, we present
Nexus-Gen, a unified model that synergizes the language reasoning capabilities
of LLMs with the image synthesis power of diffusion models. To align the
embedding space of the LLM and diffusion model, we conduct a dual-phase
alignment training process. (1) The autoregressive LLM learns to predict image
embeddings conditioned on multimodal inputs, while (2) the vision decoder is
trained to reconstruct high-fidelity images from these embeddings. During
training the LLM, we identified a critical discrepancy between the
autoregressive paradigm's training and inference phases, where error
accumulation in continuous embedding space severely degrades generation
quality. To avoid this issue, we introduce a prefilled autoregression strategy
that prefills input sequence with position-embedded special tokens instead of
continuous embeddings. Through dual-phase training, Nexus-Gen has developed the
integrated capability to comprehensively address the image understanding,
generation and editing tasks. All models, datasets, and codes are published at
https://github.com/modelscope/Nexus-Gen.git to facilitate further advancements
across the field.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: nexus-gen is a unified multimodal large language model (mllm) that integrates image understanding, generation, and editing using a dual-phase training approach to align llm and diffusion model embedding spaces, addressing performance gaps in existing open-source unified models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: nexus-gen是一个统一的多模态大型语言模型（mllm），它使用双阶段训练方法整合了图像理解、生成和编辑功能，以对齐llm和扩散模型的嵌入空间，从而弥补了现有开源统一模型中的性能差距。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21356v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Hong Zhang, Zhongjie Duan, Xingjun Wang, Yingda Chen, Yuze Zhao, Yu Zhang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">UniBiomed: A Universal Foundation Model for Grounded Biomedical Image Interpretation</h2>
            <p class="paper-summary">Multi-modal interpretation of biomedical images opens up novel opportunities
in biomedical image analysis. Conventional AI approaches typically rely on
disjointed training, i.e., Large Language Models (LLMs) for clinical text
generation and segmentation models for target extraction, which results in
inflexible real-world deployment and a failure to leverage holistic biomedical
information. To this end, we introduce UniBiomed, the first universal
foundation model for grounded biomedical image interpretation. UniBiomed is
based on a novel integration of Multi-modal Large Language Model (MLLM) and
Segment Anything Model (SAM), which effectively unifies the generation of
clinical texts and the segmentation of corresponding biomedical objects for
grounded interpretation. In this way, UniBiomed is capable of tackling a wide
range of biomedical tasks across ten diverse biomedical imaging modalities. To
develop UniBiomed, we curate a large-scale dataset comprising over 27 million
triplets of images, annotations, and text descriptions across ten imaging
modalities. Extensive validation on 84 internal and external datasets
demonstrated that UniBiomed achieves state-of-the-art performance in
segmentation, disease recognition, region-aware diagnosis, visual question
answering, and report generation. Moreover, unlike previous models that rely on
clinical experts to pre-diagnose images and manually craft precise textual or
visual prompts, UniBiomed can provide automated and end-to-end grounded
interpretation for biomedical image analysis. This represents a novel paradigm
shift in clinical workflows, which will significantly improve diagnostic
efficiency. In summary, UniBiomed represents a novel breakthrough in biomedical
AI, unlocking powerful grounded interpretation capabilities for more accurate
and efficient biomedical image analysis.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces unibiomed, a universal foundation model integrating mllm and sam for grounded biomedical image interpretation, achieving state-of-the-art performance across diverse tasks and modalities with a large-scale curated dataset.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种通用基础模型unibiomed，它集成了mllm和sam，用于基于大型数据集的生物医学图像解释，并在各种任务和模式下实现了最先进的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21336v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Linshan Wu, Yuxiang Nie, Sunan He, Jiaxin Zhuang, Hao Chen</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">AGHI-QA: A Subjective-Aligned Dataset and Metric for AI-Generated Human Images</h2>
            <p class="paper-summary">The rapid development of text-to-image (T2I) generation approaches has
attracted extensive interest in evaluating the quality of generated images,
leading to the development of various quality assessment methods for
general-purpose T2I outputs. However, existing image quality assessment (IQA)
methods are limited to providing global quality scores, failing to deliver
fine-grained perceptual evaluations for structurally complex subjects like
humans, which is a critical challenge considering the frequent anatomical and
textural distortions in AI-generated human images (AGHIs). To address this gap,
we introduce AGHI-QA, the first large-scale benchmark specifically designed for
quality assessment of AGHIs. The dataset comprises 4,000 images generated from
400 carefully crafted text prompts using 10 state of-the-art T2I models. We
conduct a systematic subjective study to collect multidimensional annotations,
including perceptual quality scores, text-image correspondence scores, visible
and distorted body part labels. Based on AGHI-QA, we evaluate the strengths and
weaknesses of current T2I methods in generating human images from multiple
dimensions. Furthermore, we propose AGHI-Assessor, a novel quality metric that
integrates the large multimodal model (LMM) with domain-specific human features
for precise quality prediction and identification of visible and distorted body
parts in AGHIs. Extensive experimental results demonstrate that AGHI-Assessor
showcases state-of-the-art performance, significantly outperforming existing
IQA methods in multidimensional quality assessment and surpassing leading LMMs
in detecting structural distortions in AGHIs.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces aghi-qa, a new dataset and metric (aghi-assessor) for evaluating the quality of ai-generated human images, addressing the limitations of existing iqa methods in capturing fine-grained structural details and distortions.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了aghi-qa，一个新的数据集和评估指标 (aghi-assessor) 用于评估ai生成的人像质量，解决了现有图像质量评估方法在捕捉细粒度结构细节和失真方面的局限性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21308v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yunhao Li, Sijing Wu, Wei Sun, Zhichao Zhang, Yucheng Zhu, Zicheng Zhang, Huiyu Duan, Xiongkuo Min, Guangtao Zhai</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Can We Achieve Efficient Diffusion without Self-Attention? Distilling Self-Attention into Convolutions</h2>
            <p class="paper-summary">Contemporary diffusion models built upon U-Net or Diffusion Transformer (DiT)
architectures have revolutionized image generation through transformer-based
attention mechanisms. The prevailing paradigm has commonly employed
self-attention with quadratic computational complexity to handle global spatial
relationships in complex images, thereby synthesizing high-fidelity images with
coherent visual semantics.Contrary to conventional wisdom, our systematic
layer-wise analysis reveals an interesting discrepancy: self-attention in
pre-trained diffusion models predominantly exhibits localized attention
patterns, closely resembling convolutional inductive biases. This suggests that
global interactions in self-attention may be less critical than commonly
assumed.Driven by this, we propose \(\Delta\)ConvFusion to replace conventional
self-attention modules with Pyramid Convolution Blocks
(\(\Delta\)ConvBlocks).By distilling attention patterns into localized
convolutional operations while keeping other components frozen,
\(\Delta\)ConvFusion achieves performance comparable to transformer-based
counterparts while reducing computational cost by 6929$\times$ and surpassing
LinFusion by 5.42$\times$ in efficiency--all without compromising generative
fidelity.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper proposes replacing self-attention in diffusion models with a convolutional approach (Δconvfusion) that achieves comparable performance with significantly reduced computational cost, suggesting self-attention may be less crucial than currently believed.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出用卷积方法(Δconvfusion)替代扩散模型中的自注意力机制，该方法在计算成本显著降低的情况下，实现了可比的性能，这表明自注意力可能没有当前认为的那么重要。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21292v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: ZiYi Dong, Chengxing Zhou, Weijian Deng, Pengxu Wei, Xiangyang Ji, Liang Lin</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Anatomical Similarity as a New Metric to Evaluate Brain Generative Models</h2>
            <p class="paper-summary">Generative models enhance neuroimaging through data augmentation, quality
improvement, and rare condition studies. Despite advances in realistic
synthetic MRIs, evaluations focus on texture and perception, lacking
sensitivity to crucial anatomical fidelity. This study proposes a new metric,
called WASABI (Wasserstein-Based Anatomical Brain Index), to assess the
anatomical realism of synthetic brain MRIs. WASABI leverages \textit{SynthSeg},
a deep learning-based brain parcellation tool, to derive volumetric measures of
brain regions in each MRI and uses the multivariate Wasserstein distance to
compare distributions between real and synthetic anatomies. Based on controlled
experiments on two real datasets and synthetic MRIs from five generative
models, WASABI demonstrates higher sensitivity in quantifying anatomical
discrepancies compared to traditional image-level metrics, even when synthetic
images achieve near-perfect visual quality. Our findings advocate for shifting
the evaluation paradigm beyond visual inspection and conventional metrics,
emphasizing anatomical fidelity as a crucial benchmark for clinically
meaningful brain MRI synthesis. Our code is available at
https://github.com/BahramJafrasteh/wasabi-mri.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces wasabi, a new metric for evaluating the anatomical accuracy of synthetic brain mris using wasserstein distance on brain region volumes, demonstrating its superior sensitivity compared to image-level metrics.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为wasabi 的新指标，用于评估合成脑部 mri 的解剖学准确性，该指标使用 wasserstein 距离衡量脑区体积，并证明其比图像级指标具有更高的灵敏度。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21771v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Bahram Jafrasteh, Wei Peng, Cheng Wan, Yimin Luo, Ehsan Adeli, Qingyu Zhao</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.6000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">VividListener: Expressive and Controllable Listener Dynamics Modeling for Multi-Modal Responsive Interaction</h2>
            <p class="paper-summary">Generating responsive listener head dynamics with nuanced emotions and
expressive reactions is crucial for practical dialogue modeling in various
virtual avatar animations. Previous studies mainly focus on the direct
short-term production of listener behavior. They overlook the fine-grained
control over motion variations and emotional intensity, especially in
long-sequence modeling. Moreover, the lack of long-term and large-scale paired
speaker-listener corpora including head dynamics and fine-grained
multi-modality annotations (e.g., text-based expression descriptions, emotional
intensity) also limits the application of dialogue modeling.Therefore, we first
newly collect a large-scale multi-turn dataset of 3D dyadic conversation
containing more than 1.4M valid frames for multi-modal responsive interaction,
dubbed ListenerX. Additionally, we propose VividListener, a novel framework
enabling fine-grained, expressive and controllable listener dynamics modeling.
This framework leverages multi-modal conditions as guiding principles for
fostering coherent interactions between speakers and listeners.Specifically, we
design the Responsive Interaction Module (RIM) to adaptively represent the
multi-modal interactive embeddings. RIM ensures the listener dynamics achieve
fine-grained semantic coordination with textual descriptions and adjustments,
while preserving expressive reaction with speaker behavior. Meanwhile, we
design the Emotional Intensity Tags (EIT) for emotion intensity editing with
multi-modal information integration, applying to both text descriptions and
listener motion amplitude.Extensive experiments conducted on our newly
collected ListenerX dataset demonstrate that VividListener achieves
state-of-the-art performance, realizing expressive and controllable listener
dynamics.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces vividlistener, a framework and a new large-scale dataset (listenerx) for generating expressive and controllable listener head dynamics in multi-modal dialogues, achieving state-of-the-art performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了vividlistener，一个用于在多模态对话中生成富有表现力和可控的听者头部动态的框架，以及一个新的大型数据集(listenerx)，并实现了最先进的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21718v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Shiying Li, Xingqun Qi, Bingkun Yang, Chen Weile, Zezhao Tian, Muyi Sun, Qifeng Liu, Man Zhang, Zhenan Sun</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.65, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Diffusion-based Adversarial Identity Manipulation for Facial Privacy Protection</h2>
            <p class="paper-summary">The success of face recognition (FR) systems has led to serious privacy
concerns due to potential unauthorized surveillance and user tracking on social
networks. Existing methods for enhancing privacy fail to generate natural face
images that can protect facial privacy. In this paper, we propose
diffusion-based adversarial identity manipulation (DiffAIM) to generate natural
and highly transferable adversarial faces against malicious FR systems. To be
specific, we manipulate facial identity within the low-dimensional latent space
of a diffusion model. This involves iteratively injecting gradient-based
adversarial identity guidance during the reverse diffusion process,
progressively steering the generation toward the desired adversarial faces. The
guidance is optimized for identity convergence towards a target while promoting
semantic divergence from the source, facilitating effective impersonation while
maintaining visual naturalness. We further incorporate structure-preserving
regularization to preserve facial structure consistency during manipulation.
Extensive experiments on both face verification and identification tasks
demonstrate that compared with the state-of-the-art, DiffAIM achieves stronger
black-box attack transferability while maintaining superior visual quality. We
also demonstrate the effectiveness of the proposed approach for commercial FR
APIs, including Face++ and Aliyun.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces diffaim, a diffusion-based method for generating adversarial faces that protect privacy by manipulating identity in the latent space, achieving strong black-box attack transferability and superior visual quality against face recognition systems.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种基于扩散的对抗性人脸操纵方法 diffaim，通过在潜在空间中操纵身份来生成保护隐私的对抗性人脸，从而实现强大的黑盒攻击可迁移性和针对人脸识别系统的卓越视觉质量。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21646v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Liqin Wang, Qianyue Hu, Wei Lu, Xiangyang Luo</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.7000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">DGSolver: Diffusion Generalist Solver with Universal Posterior Sampling for Image Restoration</h2>
            <p class="paper-summary">Diffusion models have achieved remarkable progress in universal image
restoration. While existing methods speed up inference by reducing sampling
steps, substantial step intervals often introduce cumulative errors. Moreover,
they struggle to balance the commonality of degradation representations and
restoration quality. To address these challenges, we introduce
\textbf{DGSolver}, a diffusion generalist solver with universal posterior
sampling. We first derive the exact ordinary differential equations for
generalist diffusion models and tailor high-order solvers with a queue-based
accelerated sampling strategy to improve both accuracy and efficiency. We then
integrate universal posterior sampling to better approximate
manifold-constrained gradients, yielding a more accurate noise estimation and
correcting errors in inverse inference. Extensive experiments show that
DGSolver outperforms state-of-the-art methods in restoration accuracy,
stability, and scalability, both qualitatively and quantitatively. Code and
models will be available at https://github.com/MiliLab/DGSolver.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces dgsolver, a diffusion model-based image restoration method that uses high-order solvers and universal posterior sampling to improve accuracy and efficiency, outperforming state-of-the-art methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了dgsolver，一种基于扩散模型的图像恢复方法，它使用高阶求解器和通用后验采样来提高准确性和效率，优于目前最好的方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21487v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Hebaixu Wang, Jing Zhang, Haonan Guo, Di Wang, Jiayi Ma, Bo Du</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.75, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Revisiting Diffusion Autoencoder Training for Image Reconstruction Quality</h2>
            <p class="paper-summary">Diffusion autoencoders (DAEs) are typically formulated as a noise prediction
model and trained with a linear-$\beta$ noise schedule that spends much of its
sampling steps at high noise levels. Because high noise levels are associated
with recovering large-scale image structures and low noise levels with
recovering details, this configuration can result in low-quality and blurry
images. However, it should be possible to improve details while spending fewer
steps recovering structures because the latent code should already contain
structural information. Based on this insight, we propose a new DAE training
method that improves the quality of reconstructed images. We divide training
into two phases. In the first phase, the DAE is trained as a vanilla
autoencoder by always setting the noise level to the highest, forcing the
encoder and decoder to populate the latent code with structural information. In
the second phase, we incorporate a noise schedule that spends more time in the
low-noise region, allowing the DAE to learn how to perfect the details. Our
method results in images that have accurate high-level structures and low-level
details while still preserving useful properties of the latent codes.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper proposes a two-phase training method for diffusion autoencoders (daes), first focusing on structural information and then on fine details, to improve image reconstruction quality.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了一种扩散自编码器（dae）的两阶段训练方法，首先侧重于结构信息，然后侧重于精细细节，以提高图像重建质量。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21368v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Pramook Khungurn, Sukit Seripanitkarn, Phonphrm Thawatdamrongkit, Supasorn Suwajanakorn</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.8, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Text-Conditioned Diffusion Model for High-Fidelity Korean Font Generation</h2>
            <p class="paper-summary">Automatic font generation (AFG) is the process of creating a new font using
only a few examples of the style images. Generating fonts for complex languages
like Korean and Chinese, particularly in handwritten styles, presents
significant challenges. Traditional AFGs, like Generative adversarial networks
(GANs) and Variational Auto-Encoders (VAEs), are usually unstable during
training and often face mode collapse problems. They also struggle to capture
fine details within font images. To address these problems, we present a
diffusion-based AFG method which generates high-quality, diverse Korean font
images using only a single reference image, focusing on handwritten and printed
styles. Our approach refines noisy images incrementally, ensuring stable
training and visually appealing results. A key innovation is our text encoder,
which processes phonetic representations to generate accurate and contextually
correct characters, even for unseen characters. We used a pre-trained style
encoder from DG FONT to effectively and accurately encode the style images. To
further enhance the generation quality, we used perceptual loss that guides the
model to focus on the global style of generated images. Experimental results on
over 2000 Korean characters demonstrate that our model consistently generates
accurate and detailed font images and outperforms benchmark methods, making it
a reliable tool for generating authentic Korean fonts across different styles.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a text-conditioned diffusion model for high-fidelity korean font generation, addressing the challenges of generating fonts, particularly handwritten styles, for complex languages and demonstrating improved performance over gans and vaes.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种文本条件扩散模型，用于生成高保真的韩国字体，解决了生成字体（尤其是手写风格字体）的难题，并且展示了相比gans和vaes的性能提升。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21325v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Abdul Sami, Avinash Kumar, Irfanullah Memon, Youngwon Jo, Muhammad Rizwan, Jaeyoung Choi</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.8500000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">T2ID-CAS: Diffusion Model and Class Aware Sampling to Mitigate Class Imbalance in Neck Ultrasound Anatomical Landmark Detection</h2>
            <p class="paper-summary">Neck ultrasound (US) plays a vital role in airway management by providing
non-invasive, real-time imaging that enables rapid and precise interventions.
Deep learning-based anatomical landmark detection in neck US can further
facilitate procedural efficiency. However, class imbalance within datasets,
where key structures like tracheal rings and vocal folds are underrepresented,
presents significant challenges for object detection models. To address this,
we propose T2ID-CAS, a hybrid approach that combines a text-to-image latent
diffusion model with class-aware sampling to generate high-quality synthetic
samples for underrepresented classes. This approach, rarely explored in the
ultrasound domain, improves the representation of minority classes.
Experimental results using YOLOv9 for anatomical landmark detection in neck US
demonstrated that T2ID-CAS achieved a mean Average Precision of 88.2,
significantly surpassing the baseline of 66. This highlights its potential as a
computationally efficient and scalable solution for mitigating class imbalance
in AI-assisted ultrasound-guided interventions.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces t2id-cas, a hybrid approach combining text-to-image diffusion with class-aware sampling to address class imbalance in neck ultrasound anatomical landmark detection, showing significant improvement over a baseline yolov9 model.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种名为t2id-cas的混合方法，该方法结合了文本到图像的扩散模型和类感知采样，以解决颈部超声解剖标志物检测中的类别不平衡问题，并且相比yolov9基线模型有显著改进。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21231v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Manikanta Varaganti, Amulya Vankayalapati, Nour Awad, Gregory R. Dion, Laura J. Brattain</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.9, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Embracing Collaboration Over Competition: Condensing Multiple Prompts for Visual In-Context Learning</h2>
            <p class="paper-summary">Visual In-Context Learning (VICL) enables adaptively solving vision tasks by
leveraging pixel demonstrations, mimicking human-like task completion through
analogy. Prompt selection is critical in VICL, but current methods assume the
existence of a single "ideal" prompt in a pool of candidates, which in practice
may not hold true. Multiple suitable prompts may exist, but individually they
often fall short, leading to difficulties in selection and the exclusion of
useful context. To address this, we propose a new perspective: prompt
condensation. Rather than relying on a single prompt, candidate prompts
collaborate to efficiently integrate informative contexts without sacrificing
resolution. We devise Condenser, a lightweight external plugin that compresses
relevant fine-grained context across multiple prompts. Optimized end-to-end
with the backbone, Condenser ensures accurate integration of contextual cues.
Experiments demonstrate Condenser outperforms state-of-the-arts across
benchmark tasks, showing superior context compression, scalability with more
prompts, and enhanced computational efficiency compared to ensemble methods,
positioning it as a highly competitive solution for VICL. Code is open-sourced
at https://github.com/gimpong/CVPR25-Condenser.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces 'condenser,' a novel plugin for visual in-context learning (vicl) that compresses information from multiple prompts, outperforming existing methods on benchmark tasks.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为'condenser'的新型插件，用于视觉上下文学习（vicl），它可以压缩来自多个提示的信息，并在基准任务上优于现有方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21263v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jinpeng Wang, Tianci Luo, Yaohua Zha, Yan Feng, Ruisheng Luo, Bin Chen, Tao Dai, Long Chen, Yaowei Wang, Shu-Tao Xia</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.9500000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">LoC-LIC: Low Complexity Learned Image Coding Using Hierarchical Feature Transforms</h2>
            <p class="paper-summary">Current learned image compression models typically exhibit high complexity,
which demands significant computational resources. To overcome these
challenges, we propose an innovative approach that employs hierarchical feature
extraction transforms to significantly reduce complexity while preserving bit
rate reduction efficiency. Our novel architecture achieves this by using fewer
channels for high spatial resolution inputs/feature maps. On the other hand,
feature maps with a large number of channels have reduced spatial dimensions,
thereby cutting down on computational load without sacrificing performance.
This strategy effectively reduces the forward pass complexity from \(1256 \,
\text{kMAC/Pixel}\) to just \(270 \, \text{kMAC/Pixel}\). As a result, the
reduced complexity model can open the way for learned image compression models
to operate efficiently across various devices and pave the way for the
development of new architectures in image compression technology.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper proposes a low-complexity learned image compression method using hierarchical feature transforms, significantly reducing computational requirements while maintaining bit rate reduction efficiency.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种基于分层特征变换的低复杂度图像压缩方法，显著降低了计算需求，同时保持了比特率降低效率。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21778v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ayman A. Ameen, Thomas Richter, André Kaup</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">eNCApsulate: NCA for Precision Diagnosis on Capsule Endoscopes</h2>
            <p class="paper-summary">Wireless Capsule Endoscopy is a non-invasive imaging method for the entire
gastrointestinal tract, and is a pain-free alternative to traditional
endoscopy. It generates extensive video data that requires significant review
time, and localizing the capsule after ingestion is a challenge. Techniques
like bleeding detection and depth estimation can help with localization of
pathologies, but deep learning models are typically too large to run directly
on the capsule. Neural Cellular Automata (NCA) for bleeding segmentation and
depth estimation are trained on capsule endoscopic images. For monocular depth
estimation, we distill a large foundation model into the lean NCA architecture,
by treating the outputs of the foundation model as pseudo ground truth. We then
port the trained NCA to the ESP32 microcontroller, enabling efficient image
processing on hardware as small as a camera capsule. NCA are more accurate
(Dice) than other portable segmentation models, while requiring more than 100x
fewer parameters stored in memory than other small-scale models. The visual
results of NCA depth estimation look convincing, and in some cases beat the
realism and detail of the pseudo ground truth. Runtime optimizations on the
ESP32-S3 accelerate the average inference speed significantly, by more than
factor 3. With several algorithmic adjustments and distillation, it is possible
to eNCApsulate NCA models into microcontrollers that fit into wireless capsule
endoscopes. This is the first work that enables reliable bleeding segmentation
and depth estimation on a miniaturized device, paving the way for precise
diagnosis combined with visual odometry as a means of precise localization of
the capsule -- on the capsule.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces an efficient neural cellular automata (nca) approach for bleeding segmentation and depth estimation on capsule endoscopes, enabling on-device processing with significantly reduced parameters and improved accuracy compared to other portable models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种高效的神经元胞自动机（nca）方法，用于在胶囊内窥镜上进行出血分割和深度估计，与其它便携式模型相比，它能够以显著减少的参数和提高的精度实现设备上的处理。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21562v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Henry John Krumb, Anirban Mukhopadhyay</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Towards Improved Cervical Cancer Screening: Vision Transformer-Based Classification and Interpretability</h2>
            <p class="paper-summary">We propose a novel approach to cervical cell image classification for
cervical cancer screening using the EVA-02 transformer model. We developed a
four-step pipeline: fine-tuning EVA-02, feature extraction, selecting important
features through multiple machine learning models, and training a new
artificial neural network with optional loss weighting for improved
generalization. With this design, our best model achieved an F1-score of
0.85227, outperforming the baseline EVA-02 model (0.84878). We also utilized
Kernel SHAP analysis and identified key features correlating with cell
morphology and staining characteristics, providing interpretable insights into
the decision-making process of the fine-tuned model. Our code is available at
https://github.com/Khoa-NT/isbi2025_ps3c.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents a vision transformer-based pipeline for cervical cancer screening, achieving a slightly improved f1-score compared to the baseline and providing interpretability through shap analysis. the code is publicly available.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种基于vision transformer的宫颈癌筛查流程，与基线相比，f1分数略有提高，并通过shap分析提供可解释性。代码已公开。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21340v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Khoa Tuan Nguyen, Ho-min Park, Gaeun Oh, Joris Vankerschaver, Wesley De Neve</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 1.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Legilimens: Performant Video Analytics on the System-on-Chip Edge</h2>
            <p class="paper-summary">Continually retraining models has emerged as a primary technique to enable
high-accuracy video analytics on edge devices. Yet, existing systems employ
such adaptation by relying on the spare compute resources that traditional
(memory-constrained) edge servers afford. In contrast, mobile edge devices such
as drones and dashcams offer a fundamentally different resource profile:
weak(er) compute with abundant unified memory pools. We present Legilimens, a
continuous learning system for the mobile edge's System-on-Chip GPUs. Our
driving insight is that visually distinct scenes that require retraining
exhibit substantial overlap in model embeddings; if captured into a base model
on device memory, specializing to each new scene can become lightweight,
requiring very few samples. To practically realize this approach, Legilimens
presents new, compute-efficient techniques to (1) select high-utility data
samples for retraining specialized models, (2) update the base model without
complete retraining, and (3) time-share compute resources between retraining
and live inference for maximal accuracy. Across diverse workloads, Legilimens
lowers retraining costs by 2.8-10x compared to existing systems, resulting in
18-45% higher accuracies.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents legilimens, a continuous learning system for video analytics on mobile edge devices, which reduces retraining costs and improves accuracy by exploiting overlapping model embeddings and employing compute-efficient techniques.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了legilimens，一个用于移动边缘设备上视频分析的持续学习系统。该系统通过利用重叠的模型嵌入和采用计算高效的技术，从而降低了重新训练的成本并提高了准确性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.21136v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Murali Ramanujam, Yinwei Dai, Kyle Jamieson, Ravi Netravali</p>
            
        </motion.div>
        
    </div>

    <footer class="footer">
        Generated on 2025-05-03 04:28:35 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank">onion-liu</a>.
    </footer>

</body>
</html>